Airbnb

Load required packages.

library( tidyverse )
library( dplyr )
library( gridExtra )
library( mapview )
library( leaflet )
library( leaflet.extras )

Set up workspace, i.e., remove all existing data from working memory and load data from CSV file.

# Clear every object from the current workspace, then read the cleaned
# listings CSV into `df`, which all later chunks use.
rm(list = ls())
df <- read.csv(file = "./data/airbnb_clean.csv")

General overview of the listings

# Number of listings per room type. count() groups by room_type and returns
# the rows ordered by that column, so sorting the input beforehand has no
# effect and is omitted.
room_type_counts <- df %>%
  count(room_type)

room_type_counts
##         room_type     n
## 1 Entire home/apt 25393
## 2    Private room 22306
## 3     Shared room  1159
# Pie chart of the room-type shares: a single stacked bar drawn in polar
# coordinates. In-slice text labels stay disabled because they would need a
# `ypos` column that room_type_counts does not have.
ggplot(room_type_counts, aes(x = "", y = n, fill = room_type)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  theme_void()
# Scatter plot of price against `last_review_age`.
df %>%
  ggplot(mapping = aes(x = last_review_age, y = price)) +
  geom_point()
## Warning: Removed 10037 rows containing missing values (`geom_point()`).

# Scatter plot of price against `availability_365`.
df %>%
  ggplot(mapping = aes(x = availability_365, y = price)) +
  geom_point()

# Box plot of log_price for each room type.
df %>%
  ggplot(aes(x = room_type, y = log_price)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (`stat_boxplot()`).

# The same comparison drawn as violins.
ggplot(df, aes(x = room_type, y = log_price)) +
  geom_violin()
## Warning: Removed 11 rows containing non-finite values (`stat_ydensity()`).

# Violin plot of the log of `last_review_age` per room type. Rows without a
# last_review_age are removed first; the log is taken of the integer value.
ggplot(
  data = df %>%
    filter(!is.na(last_review_age)) %>%
    mutate(log_last_review_age = log(as.integer(last_review_age))),
  mapping = aes(x = room_type, y = log_last_review_age)
) +
  geom_violin()
## Warning: Removed 88 rows containing non-finite values (`stat_ydensity()`).

Geographic analysis

# Listings per neighbourhood group.
neighbourhood_group_counts <- count(df, neighbourhood_group)

# Pie chart of those counts (one stacked bar in polar coordinates).
ggplot(
  neighbourhood_group_counts,
  aes(x = "", y = n, fill = neighbourhood_group)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  theme_void()

# Price against `distance_from_center`, coloured by neighbourhood group.
df %>%
  ggplot(
    mapping = aes(
      x = distance_from_center,
      y = price,
      color = neighbourhood_group
    )
  ) +
  geom_point() +
  theme_minimal()

# Box plot of log_price for each neighbourhood group.
df %>%
  ggplot(aes(x = neighbourhood_group, y = log_price)) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (`stat_boxplot()`).

#df %>%
#  filter(log_price >= 0) %>% 
#  ggplot(mapping = aes(x = log_price, y = neighbourhood_group)) +
#  geom_density_ridges(alpha = 0.5)
#df %>%
#  ggplot(mapping = aes(x = distance_from_center, y = neighbourhood_group)) +
#  geom_density_ridges(alpha = 0.5)
# Box plot of `distance_from_center` for each room type.
ggplot(df, aes(x = room_type, y = distance_from_center)) +
  geom_boxplot()

# Listing coordinates, coloured by room type.
ggplot(df, aes(x = longitude, y = latitude, color = room_type)) +
  geom_point()

# One scatter plot of listing coordinates per room type, titled with the room
# type. The former debug print() of each room type is dropped so that only
# the figure appears in the rendered output.
plots <- lapply(unique(df$room_type), function(r_type) {
  df %>%
    filter(room_type == r_type) %>%
    ggplot(mapping = aes(x = longitude, y = latitude)) +
    geom_point() +
    ggtitle(r_type)
})
# Lay the plots out side by side; the column count follows the number of
# room types instead of being fixed at 3.
grid.arrange(grobs = plots, ncol = length(plots))

Most expensive neighbourhoods (ranked by the summed price of their listings)

# Stacked bar chart of summed listing prices per neighbourhood, split by room
# type, for a single neighbourhood group.
#
# @param group Value of `neighbourhood_group` whose listings are plotted.
# @param data  Listings data frame with the columns `neighbourhood_group`,
#   `neighbourhood`, `room_type` and `price`. Defaults to the workspace-level
#   `df`, so existing one-argument calls keep working.
# @return A ggplot object; bars are ordered by descending summed price.
get_most_expensive_neighbourhoods_in_group <- function(group, data = df) {
  group_listings <- data %>%
    filter(neighbourhood_group == group)

  # Neighbourhood names ordered by summed price, highest first. This vector
  # defines both the row order below and the bar order on the x axis.
  neighbourhood_order <- group_listings %>%
    group_by(neighbourhood) %>%
    summarise(sum_price = sum(price), .groups = "drop") %>%
    arrange(desc(sum_price)) %>%
    pull(neighbourhood)

  price_by_room_type <- group_listings %>%
    group_by(neighbourhood, room_type) %>%
    summarise(sum_price = sum(price), .groups = "drop") %>%
    arrange(match(neighbourhood, neighbourhood_order)) %>%
    # NOTE(review): this keeps the first 20 (neighbourhood, room_type) rows,
    # not 20 neighbourhoods, so the last bar may lack some room types.
    head(20) %>%
    mutate(neighbourhood = factor(neighbourhood, levels = neighbourhood_order))

  ggplot(
    price_by_room_type,
    aes(x = neighbourhood, y = sum_price, fill = room_type)
  ) +
    # geom_col() always uses the identity stat; it takes no `stat` argument.
    geom_col() +
    labs(
      title = paste("Most expensive neighbourhoods in", group),
      x = "Neighbourhood",
      y = "Price in $"
    )
}

Queens

# Summed-price bar chart for the neighbourhoods of Queens.
get_most_expensive_neighbourhoods_in_group("Queens")
## `summarise()` has grouped output by 'neighbourhood'. You can override using the
## `.groups` argument.
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

Brooklyn

# Summed-price bar chart for the neighbourhoods of Brooklyn.
get_most_expensive_neighbourhoods_in_group("Brooklyn")
## `summarise()` has grouped output by 'neighbourhood'. You can override using the
## `.groups` argument.
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

Manhattan

# Summed-price bar chart for the neighbourhoods of Manhattan.
get_most_expensive_neighbourhoods_in_group("Manhattan")
## `summarise()` has grouped output by 'neighbourhood'. You can override using the
## `.groups` argument.
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

Staten Island

# Summed-price bar chart for the neighbourhoods of Staten Island.
get_most_expensive_neighbourhoods_in_group("Staten Island")
## `summarise()` has grouped output by 'neighbourhood'. You can override using the
## `.groups` argument.
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

Bronx

# Summed-price bar chart for the neighbourhoods of the Bronx.
get_most_expensive_neighbourhoods_in_group("Bronx")
## `summarise()` has grouped output by 'neighbourhood'. You can override using the
## `.groups` argument.
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

Queens

# NOTE(review): identical to the first "Queens" call at the start of this
# sequence, so the same chart is rendered a second time.
get_most_expensive_neighbourhoods_in_group("Queens")
## `summarise()` has grouped output by 'neighbourhood'. You can override using the
## `.groups` argument.
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`

Geographic Listing density

# Interactive leaflet heatmap of listing locations with five fixed landmark
# markers.
#
# @param df    Data frame with numeric `longitude` and `latitude` columns.
# @param zoom  Initial zoom level passed to setView().
# @param ratio Multiplier applied to nrow(df) to obtain the heatmap `max`
#   value.
# @return A leaflet map widget centred on the mean coordinates of `df`.
plot_map <- function(df, zoom, ratio) {
  # Take the means directly from the columns: this yields plain scalars for
  # data frames and tibbles alike, and missing coordinates do not turn the
  # view centre into NA.
  center_lng <- mean(df$longitude, na.rm = TRUE)
  center_lat <- mean(df$latitude, na.rm = TRUE)

  map_max <- nrow(df) * ratio

  # noHide = TRUE keeps the label permanently visible instead of on hover.
  label_opt <- labelOptions(noHide = TRUE, textsize = "10px")

  df %>%
    leaflet() %>%
    addTiles() %>%
    addProviderTiles(providers$OpenStreetMap.DE) %>%
    setView(center_lng, center_lat, zoom) %>%
    addHeatmap(
      lng = ~longitude, lat = ~latitude,
      max = map_max, radius = 20, blur = 10
    ) %>%
    # Only this marker receives the always-visible label options.
    addCircleMarkers(
      lat = 40.6897, lng = -74.0445,
      label = "Statue of Liberty", labelOptions = label_opt
    ) %>%
    addCircleMarkers(lat = 40.7484, lng = -73.9856, label = "Empire State Building") %>%
    addCircleMarkers(lat = 40.7826, lng = -73.9655, label = "Central Park") %>%
    addCircleMarkers(lat = 40.7579, lng = -73.9855, label = "Times Square") %>%
    addCircleMarkers(lat = 40.7061, lng = -73.9967, label = "Brooklyn Bridge")
}
# Heatmap over all listings.
plot_map(df, zoom = 10, ratio = 0.05)

Manhattan

manhattan <- filter(df, neighbourhood_group == "Manhattan")

plot_map(manhattan, zoom = 11.5, ratio = 0.02)

Bronx

bronx <- filter(df, neighbourhood_group == "Bronx")

plot_map(bronx, zoom = 11.5, ratio = 0.01)

Queens

queens <- filter(df, neighbourhood_group == "Queens")

plot_map(queens, zoom = 11, ratio = 0.02)
# Staten Island
staten_island <- filter(df, neighbourhood_group == "Staten Island")

plot_map(staten_island, zoom = 11, ratio = 0.05)

Brooklyn

brooklyn <- filter(df, neighbourhood_group == "Brooklyn")

plot_map(brooklyn, zoom = 11, ratio = 0.05)

Geographic distribution of room types

Private room

# Listings offered as private rooms.
private_room <- df %>%
  filter(room_type == "Private room")

plot_map(private_room, 11, ratio = 0.03)
# Centre of the clustered marker map: mean coordinates of the private rooms,
# taken straight from the columns so the values are plain scalars.
center_lng <- mean(private_room$longitude, na.rm = TRUE)
center_lat <- mean(private_room$latitude, na.rm = TRUE)

# Clustered marker map. The coordinate columns are named explicitly so that
# leaflet does not have to guess them from the column names.
private_room %>%
  leaflet() %>%
  addTiles() %>%
  addProviderTiles(providers$OpenStreetMap.DE) %>%
  setView(center_lng, center_lat, 10) %>%
  addMarkers(
    lng = ~longitude,
    lat = ~latitude,
    clusterOptions = markerClusterOptions()
  )

Entire home

# Heatmap of listings that are entire homes/apartments.
entire_home <- filter(df, room_type == "Entire home/apt")

plot_map(entire_home, zoom = 11, ratio = 0.03)

Shared room

# Heatmap of shared-room listings.
shared_room <- filter(df, room_type == "Shared room")

plot_map(shared_room, zoom = 11, ratio = 0.03)

Locations of the most and least expensive listings

# Interactive leaflet map with one circle marker per row of `df`.
#
# @param df   Data frame with `longitude`, `latitude` and `name` columns;
#   `name` is used as the marker label.
# @param zoom Initial zoom level passed to setView().
# @return A leaflet map widget centred on the mean coordinates of `df`.
plot_marker_map <- function(df, zoom) {
  # Centre on the data being plotted. The previous version always read the
  # global `most_expensive`, so any other input was centred on the wrong
  # place.
  center_lng <- mean(df$longitude, na.rm = TRUE)
  center_lat <- mean(df$latitude, na.rm = TRUE)

  df %>%
    leaflet() %>%
    addTiles() %>%
    addProviderTiles(providers$OpenStreetMap.DE) %>%
    setView(center_lng, center_lat, zoom) %>%
    addCircleMarkers(lng = ~longitude, lat = ~latitude, label = ~name)
}
# Ten listings with the highest price.
most_expensive <- head(arrange(df, desc(price)), 10)

plot_marker_map(most_expensive, zoom = 11)
# Ten listings with the lowest price.
least_expensive <- head(arrange(df, price), 10)

plot_marker_map(least_expensive, zoom = 11)
# Convex hull of the listing coordinates of one neighbourhood group.
#
# @param group Value of `neighbourhood_group` to keep.
# @param data  Listings data frame with `longitude`, `latitude` and
#   `neighbourhood_group` columns. Defaults to the workspace-level `df`, so
#   existing one-argument calls keep working.
# @return The rows of the filtered data (columns longitude, latitude,
#   neighbourhood_group) that lie on the convex hull, in chull() order.
get_hull <- function(group, data = df) {
  df_points <- data %>%
    filter(neighbourhood_group == group) %>%
    select(longitude, latitude, neighbourhood_group)

  # Pass the two coordinate vectors explicitly. Handing chull() the whole
  # data frame only worked because it silently uses the first two columns
  # after coercing every column, including the character one, to numbers.
  hull_idx <- chull(df_points$longitude, df_points$latitude)

  df_points[hull_idx, ]
}

# Convex hull of the listing coordinates for each neighbourhood group.
manhattan_hull <- get_hull("Manhattan")
queens_hull <- get_hull("Queens")
bronx_hull <- get_hull("Bronx")
staten_island_hull <- get_hull("Staten Island")
brooklyn_hull <- get_hull("Brooklyn")

# Base plot over all listings with one unfilled black outline per hull,
# added in the order Manhattan, Queens, Bronx, Staten Island, Brooklyn.
base_map <- ggplot(df, aes(x = longitude, y = latitude)) +
  lapply(
    list(
      manhattan_hull,
      queens_hull,
      bronx_hull,
      staten_island_hull,
      brooklyn_hull
    ),
    function(hull) {
      geom_polygon(
        data = hull,
        aes(x = longitude, y = latitude),
        fill = NA,
        color = "black"
      )
    }
  )
# Binned listing counts plus filled 2D density contours on top of the hull
# outlines in `base_map`. Fill and alpha follow the computed contour level;
# after_stat(level) replaces the deprecated `..level..` notation, so the
# deprecation warning is no longer emitted.
base_map +
  geom_bin2d(bins = 500) +
  stat_density2d(
    aes(fill = after_stat(level), alpha = after_stat(level)),
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Distribution of Listings") +
  theme_minimal()

# Listing density for Manhattan only: hull outline, binned counts and filled
# 2D density contours. after_stat(level) replaces the deprecated `..level..`
# notation for the contour-level mapping.
df %>%
  filter(neighbourhood_group == "Manhattan") %>%
  ggplot(mapping = aes(x = longitude, y = latitude)) +
  geom_polygon(
    data = manhattan_hull,
    aes(x = longitude, y = latitude),
    fill = NA,
    color = "black"
  ) +
  geom_bin2d(bins = 600) +
  stat_density2d(
    aes(fill = after_stat(level), alpha = after_stat(level)),
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Listing Density in Manhattan") +
  theme_minimal()

# Binned counts and filled 2D density contours for listings whose longitude
# lies within [-74.1, -73.8]. lims() discards rows outside that range before
# the stats are computed, hence the "Removed ... rows" warnings below.
# after_stat(level) replaces the deprecated `..level..` notation.
df %>%
  ggplot(mapping = aes(x = longitude, y = latitude)) +
  geom_bin2d(bins = 200) +
  stat_density2d(
    aes(fill = after_stat(level), alpha = after_stat(level)),
    geom = "polygon"
  ) +
  lims(x = c(-74.1, -73.8)) +
  scale_fill_gradient(low = "green", high = "red") +
  labs(title = "Distribution of Listings") +
  theme_minimal()
## Warning: Removed 979 rows containing non-finite values (`stat_bin2d()`).
## Warning: Removed 979 rows containing non-finite values (`stat_density2d()`).
## Warning: Removed 20 rows containing missing values (`geom_tile()`).

# 200-bin 2D count heatmap of the Manhattan listing coordinates, coloured
# from blue (low) to red (high).
ggplot(
  data = filter(df, neighbourhood_group == "Manhattan"),
  mapping = aes(x = longitude, y = latitude)
) +
  geom_bin2d(bins = 200) +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Heatmap of Points") +
  theme_minimal()